Standardtasks at the beginning

rm(list=ls()) #remove ALL objects 
cat("\014") # clear console window prior to new run
Sys.setenv(LANG = "en") #Let's keep stuff in English
Sys.setenv("R_REMOTES_NO_ERRORS_FROM_WARNINGS"=TRUE)
Sys.setlocale("LC_ALL")
options(scipen = 999)
options(dplyr.summarise.inform = FALSE)

Libraries and paths

packages<-c("tidyverse", "readxl", "vegan", "phyloseq", "QsRutils", "gridExtra", "data.table", "dunn.test", "microbiome","pairwiseAdonis", "tidytext", "DESeq2", "ggrepel", "eulerr", "microeco", "file2meco", "RColorBrewer", "ggpubr")
lapply(packages, library, character.only=TRUE)

inwd  <- "/Users/xinope/M_GU/R/RProjects/FRAM_16s_Chile/raw_data"
outwd <- "/Users/xinope/M_GU/R/RProjects/FRAM_16s_Chile/outputs"
setwd("/Users/xinope/M_GU/R/RProjects/FRAM_16s_Chile")

Notes

  1. R1.s2 removed due to low sequencing depth (<5000 reads)
  2. Two samples missing due to sequencing issues (R1.s1 & R2.03sed)

Input Data

# OTU table and taxonomy
input<- read_excel(paste0(inwd, "/V3Sweden_clean_otus_table.xlsx"), col_names = TRUE, col_types = NULL)
# Metadata
samples<- read_delim(paste0(inwd, "/sampling_sites.csv"), locale=locale(decimal_mark = ","))

Defining some functions

geo_mean = function(x, na.rm=TRUE){
  exp(sum(log(x[x > 0]), na.rm=na.rm) / length(x))
}

Environmental parameters

Several parameters were recorded during the FRAM Centre for Future Chemical Risk Assessment and Management Strategies field campaign in Chile 2018. Besides, the antibiotic pressure was included in the form of toxic units (TU_MIC).

# Sub-setting sample input file
parameter.order = c("TU_MIC","nitrate", "nitrite", "phosphate", "silicate", "temp", "light", "pH", "conductivity", "oxygen")

env_vars <- samples %>%
  mutate(sample_name = sample) %>%
  mutate_at(vars("TU_MIC"), as.numeric) %>%
  column_to_rownames(var = "sample") %>%
  select(sample_name, site, label, type, matrix, all_of(parameter.order))

# Define order of parameters
combine.order = c("nitrate Reference", "nitrate River", "nitrate Tributary",
                  "nitrite Reference", "nitrite River", "nitrite Tributary",
                  "phosphate Reference", "phosphate River", "phosphate Tributary",
                  "silicate Reference", "silicate River", "silicate Tributary",
                  "temp Reference", "temp River", "temp Tributary",
                  "light Reference", "light River", "light Tributary",
                  "pH Reference", "pH River", "pH Tributary",
                  "conductivity Reference", "conductivity River", "conductivity Tributary",
                  "oxygen Reference", "oxygen River", "oxygen Tributary",
                  "TU_MIC Reference", "TU_MIC River", "TU_MIC Tributary")

# Plotting environmental parameters
env_vars %>% select(matrix, type, nitrate, nitrite, phosphate, silicate, temp, light, pH, conductivity, oxygen, TU_MIC) %>%
  pivot_longer(cols = all_of(parameter.order), names_to= "parameters", values_to= "values") %>%
  filter(matrix == "Water") %>%
  mutate(parameters = factor(parameters, levels = parameter.order)) %>%
  unite(combine, parameters, type, sep= " ") %>%
  mutate(combine = factor(combine, levels = combine.order)) %>%
    ggplot(aes(x = combine, y = values, fill= combine)) +
  geom_boxplot(outlier.color = NA, colour = "black", alpha = 0.4, lwd=0.3) +
  labs(x = "Environmental parameters", y = "Units") +
  scale_y_log10() +
  theme_bw()+ theme(legend.position="none", strip.background = element_rect(colour="black", fill="white"))+
  theme(strip.background = element_rect(colour="black", fill="white"))+
  theme(axis.title.x=element_blank(), axis.text.x=element_text(angle=40,hjust=0.98,size=rel(1)), axis.text.y=element_text(size=rel(1)))

# Statistics
type.order<- c("Reference", "River", "Tributary")

Stats.EnvVars<- env_vars %>% select(matrix, type, nitrate, nitrite, phosphate, silicate, temp, light, pH, conductivity, oxygen, TU_MIC) %>%
  pivot_longer(cols = parameter.order, names_to= "parameters", values_to= "values") %>%
  filter(matrix == "Water") %>%
  mutate(parameters = factor(parameters, levels = parameter.order)) %>% 
  mutate_at("values", as.numeric) %>%
  mutate(type = factor(type, levels = type.order)) %>%
  group_by(parameters)%>%
  reframe(p.value = dunn.test(values, type, method= "bh")$P.adjusted)
print(Stats.EnvVars)

Significant differences:

Microbial community analysis using phyloseq

The amplicon data (16s rRNA) was delivered as a table. The table has embedded the abundance of operational taxonomic units (OTUs) together with assigned taxonomy. Therefore, table was organised in two files in order to built the phyloseq object.

# data frame with only OTUs
input_otu<- input %>% 
  select(!taxonomy)%>%
  column_to_rownames(var = "id")

# data frame with sample ID and taxonomy
input_tax<- input %>% 
  select(id, taxonomy)%>%
  separate(taxonomy, into=c("kingdom","phylum","class","order","family","genus","species"), sep=";", remove=TRUE, convert=TRUE)%>%
  mutate(kingdom = gsub("k__", "", kingdom))%>%
  mutate(phylum = gsub("p__", "", phylum))%>%
  mutate(class = gsub("c__", "", class))%>%
  mutate(order = gsub("o__", "", order))%>%
  mutate(family = gsub("f__", "", family))%>%
  mutate(genus = gsub("g__", "", genus))%>%
  mutate(species = gsub("s__", "", species))%>%
  mutate_all(na_if," ") %>%
  column_to_rownames(var = "id")

# transform data frames into matrices
matrix_otu<-as.matrix(input_otu)
matrix_tax<-as.matrix(input_tax)

Create phyloseq object

phyloseq uses OTU table, taxonomy table, and metadata. OTUs unassigned at phylum levels were removed. In addition, chloroplast and mitocondria were also removed.

#transform to phyloseq object
OTU_input = otu_table(matrix_otu, taxa_are_rows = TRUE)
TAX_input = tax_table(matrix_tax)
PHY_envars = sample_data(env_vars)

#make phyloseq object
OBJ_phyl<- phyloseq(OTU_input, TAX_input, PHY_envars)

# remove uncharacterised, chloroplast, and mitochondria
OBJ_phyl<- subset_taxa(OBJ_phyl, !is.na(phylum) & !phylum %in% c("","uncharacterized") & (class!="Chloroplast") & (family!="mitochondria"))

# back-up as raw object
OBJ_phyl_raw<-OBJ_phyl

# clean up
rm(input, input_otu, input_tax, matrix_otu, matrix_tax, samples, PHY_envars)

Checking sequencing depth and removing samples with low reads.

# check sequencing depth
OBJ_phyl_rare<- otu_table(OBJ_phyl)
class(OBJ_phyl_rare) <- "matrix" 

## you get a warning here, but this is what we need to have
OBJ_phyl_rare <- t(OBJ_phyl_rare) # transpose observations to rows

## plotting sequencing depth
rarecurve(OBJ_phyl_rare, step=100, lwd=2, ylab="OTU", cex=0.75)

# removing sample with low sequencing depth (<5000)
OBJ_phyl = subset_samples(OBJ_phyl, sample_name != "R1.s2")

Rarefaction and singletons

We tested the role of rarefaction as well as singletons (OTUs present exactly once). Thus, the phyloseq object is filtered as follows:

  • OBJS1 = NO rarefaction + with singletons
  • OBJS2 = NO rarefaction + removed singletons
  • OBJS3 = rarefaction + with singletons
  • OBJS4 = rarefaction + removed singletons

Each object will be used to estimate within sample diversity (alpha-diversity) and the results will be used to assess the role of rarefaction and singletons.

# OBJ_phyl = phyloseq object no rarefied
# OBJ_phyl_rar = phyloseq object rarefied

# rarefy without replacement
OBJ_phyl_rare= rarefy_even_depth(OBJ_phyl, rngseed=1, sample.size=0.99*min(sample_sums(OBJ_phyl)), replace=F)

# for OBJS1, OTUs with zero reads are removed
OBJS1 <- prune_taxa(taxa_sums(OBJ_phyl) > 0, OBJ_phyl)
OBJS2 <- filter_taxa(OBJ_phyl, function (x) {sum(x > 0) > 1}, prune=TRUE)
OBJS3 <- OBJ_phyl_rare
OBJS4 <- filter_taxa(OBJ_phyl_rare, function (x) {sum(x > 0) > 1}, prune=TRUE)

Within sample diversity (alpha-diversity)

We considered four diversity metrics: observed, Chao1, Shannon index, and evenness. Brief definition is provided:

  • Observed Taxa (Species richness) = Most simple metric. It just counts up the number of different taxa you observe in a sample at a given taxonomic level.
  • Chao1 = Estimate diversity from abundance data. It assumes that the number of observations for a taxa has a Poisson distribution and corrects for variance
  • Shannon Index = Estimator for both species richness and evenness, but with weight on the richness. The idea behind this metric is that the more species you observe, and the more even their abundances are, the higher the entropy, or the higher the uncertainty of predicting which species you would see next if you were to look at another read from this sample
indexes.order<- c("Observed", "Chao1", "Shannon")

# Define a list of phyloseq objects
phyloseq_objects <- list(OBJS1, OBJS2, OBJS3, OBJS4)

# Initialize an empty list to store the results
DFalpha_list <- list()

# Loop through the phyloseq objects
for (i in seq_along(phyloseq_objects)) {
  DFalpha <- data.frame(
    phyloseq::estimate_richness(phyloseq_objects[[i]], measures = indexes.order),
    "Type" = phyloseq::sample_data(phyloseq_objects[[i]])$type,
    "matrix" = phyloseq::sample_data(phyloseq_objects[[i]])$matrix,
    "sample_name" = phyloseq::sample_data(phyloseq_objects[[i]])$label
  )
  
  selected_cols <- c("sample_name", "Type", "matrix", "Observed", "Chao1", "Shannon")
  
  # Role of singletons was assessed.
  DFalpha <- DFalpha %>%
    select(all_of(selected_cols)) %>%
    mutate(singleton = ifelse(i %% 2 == 1, "TRUE", "FALSE")) %>%
    mutate(rarefaction = ifelse(i <= 2, "non_rarefied", "rarefied")) %>%
    mutate(Dataset = paste0("OBJS", i))
  
  DFalpha_list[[i]] <- DFalpha
}

# Combine all the data frames in the list
DFalphasets <- bind_rows(DFalpha_list)

# Normality test
DFalphasets %>%
  group_by(rarefaction, singleton)%>%
  summarise(statistic = shapiro.test(Observed)$statistic,
            p.value = shapiro.test(Observed)$p.value)
## # A tibble: 4 × 4
## # Groups:   rarefaction [2]
##   rarefaction  singleton statistic  p.value
##   <chr>        <chr>         <dbl>    <dbl>
## 1 non_rarefied FALSE         0.982 0.622   
## 2 non_rarefied TRUE          0.982 0.622   
## 3 rarefied     FALSE         0.907 0.000714
## 4 rarefied     TRUE          0.906 0.000690
# summary normality
# if p-value is < 0.05, then data is normally distributed
# non-rarefied = non-normal -> non-parametric
# rarefied = normal -> parametric

rm(DFalpha_list, DFalpha, phyloseq_objects)

alpha-diversity differences at different sampling sites.

We assessed differences between the type of sampling sites for each environmental compartment. Type of sampling sites mean “Reference” vs “River” vs “Tributary” sites. Each type of sampling site is linked to land-uses within the River Aconcagua basin.

matrix.order<- c("Water", "Biofilm", "Sediment")
dataset.order<- c("OBJS1", "OBJS2", "OBJS3", "OBJS4")

# Type = Reference vs River vs Tributaries
DFalphasets %>%
  ungroup()%>%
  pivot_longer(cols = all_of(indexes.order), names_to = "index", values_to = "values")%>%
  select(-c(sample_name))%>%
  mutate_at("values", as.numeric) %>%
  mutate(matrix = factor(matrix, levels = matrix.order)) %>%
  mutate(Type = factor(Type, levels = type.order)) %>%
  mutate(index = factor(index, levels = indexes.order)) %>%
  mutate(Dataset = factor(Dataset, levels = dataset.order)) %>%
  group_by(Dataset, index, matrix)%>%
  summarise(p_value = kruskal.test(values ~ Type)$p.value) %>%
  filter(p_value < 0.05)
## # A tibble: 2 × 4
## # Groups:   Dataset, index [2]
##   Dataset index    matrix   p_value
##   <fct>   <fct>    <fct>      <dbl>
## 1 OBJS1   Observed Sediment  0.0228
## 2 OBJS2   Observed Sediment  0.0228

Overall, we did not find major differences for sampling sites types (i.e., reference, river, tributary) for each of the environmental compartments (i.e., water, biofilm, and sediment). These findings were consistent between the different datasets (OBJS1-OBJS4). Nevertheless, a discrepancy showed up for the sediment compartment and particularly the observed diversity metric between non-rarefied (OBJS1 & OBJS2) and rarefied data (OBJS3 & OBJS4). Reference sediments were significantly more diverse than river and tributary sediments in non-rarefied datasets (OBJS1 & OBJS2), while rarefied data showed no differences. See figure below.

indexes.order2<- c("Observed","Chao1","Shannon","Evenness")

combine_sites.order2 =c("Water Reference", "Water River", "Water Tributary",
                        "Biofilm Reference", "Biofilm River", "Biofilm Tributary",
                        "Sediment Reference", "Sediment River", "Sediment Tributary")

DFalphasets %>%
  mutate(Evenness= (Shannon/log(Observed)))%>%
  pivot_longer(cols = all_of(indexes.order2), names_to = "index", values_to = "values")%>%
  unite(combine_sites, matrix, Type, sep= " ", remove = FALSE) %>%
  mutate(combine_sites = factor(combine_sites, levels = combine_sites.order2)) %>%
  mutate_at(vars(index), factor, levels = indexes.order2) %>%
  ggplot(aes(x = combine_sites, y = values, fill= combine_sites)) +
  geom_boxplot(outlier.color = NA, colour = "black", alpha = 0.4, lwd=0.3) +
  scale_fill_manual(values=c("#009E73", "#56B4E9", "#E69F00", "#009E73", "#56B4E9", "#E69F00", "#009E73", "#56B4E9", "#E69F00"))+
  geom_jitter(aes(shape= matrix), alpha = 0.5, size = 2, width = 1) +
  labs(x = "", y = "", title = "A) alpha-diversity per sampling type") +
  facet_wrap( Dataset ~ index, scales = "free", ncol = 4) +
  theme_bw()+ #theme(legend.position="none", strip.background = element_rect(colour="black", fill="white"))+
  theme(strip.background = element_rect(colour="black", fill="white"))+
  theme(axis.title.x=element_blank(), axis.text.x=element_text(angle=40,hjust=0.98,size=rel(1)), axis.text.y=element_text(size=rel(1)))

rm(OBJS2, OBJS3, OBJS4, OBJ_phyl, OBJ_phyl_raw, OBJ_phyl_rare)

Remarks regarding rarefaction and singletons

Overall, no major differences were determined between the four tested scenarios. Singletons did not affect the alpha-diversity outputs therefore scenarios considering singletons are selected (OBJS1 & OBJS3) for further downstream analysis. This decision comprises not excluding any data. However, we observed that rarefaction created distinct results only for the sediment compartment (see above). Finally, OBJS1 is selected for further downstream analysis.

Biological data can be rarefied to account for the differences in library size and heteroscedasticity (unequal variability between samples) within a dataset. However, rarefying biological count data is statistically inadmissible. It requires the omission of available valid data, which leads to a loss of power or decreased sensitivity (increase in Type-II error). This is evident in sample-wise comparisons when fractions of a sample or even whole samples are discarded to generate rarefied counts but also in differential abundance analysis which expects the inclusion of moderate to rare ASVs that are more likely to be part of the omitted data. Additionally, rarefying does not address overdispersion among biological replicates which results in an underestimation of uncertainty due to an unacceptably high rate of Type-I errors and therefore, decreased specificity. Furthermore, the selection of the minimum threshold for the library size is arbitrary which influences downstream inference. Finally, the random subsampling step in rarefying adds additional uncertainty to the biological data (McMurdie und Holmes 2014).

Bacterial composition

#Convert to relative abundance
OBJS1rel = phyloseq::transform_sample_counts(OBJS1, function(x){x / sum(x)})

# phyloseq object must be converted to dataframe for plotting
DFOBJS1rel <- psmelt(OBJS1rel)

sample.order<- c("RS1.s1","RS1.s2","RS2.s1","RS2.s2","RS3.s1","RS3.s2","R1.s2","R2.s1","R2.s2","R3.s1","R3.s2","T1.s1","T1.s2","T2.s1","T2.s2","T3.s1","T3.s2","RS1.01sed","RS1.02sed","RS1.03sed","RS2.01sed","RS2.02sed","RS2.03sed","RS3.01sed","RS3.02sed","RS3.03sed","R1.01sed","R1.02sed","R1.03sed","R2.01sed","R2.02sed","R3.01sed","R3.02sed","R3.03sed","T1.01sed","T1.02sed","T1.03sed","T2.01sed","T2.02sed","T2.03sed","T3.01sed","T3.02sed","T3.03sed","RS1.eDNA","RS2.eDNA","RS3.eDNA","R1.eDNA","R2.eDNA","R3.eDNA","T1.eDNA","T2.eDNA","T3.eDNA")

# Plotting community composition at phylum level
DFOBJS1rel %>% mutate_at(vars(Sample), factor, levels = sample.order) %>%
  mutate_at(vars(matrix), factor, levels = matrix.order) %>%
ggplot(aes(x= factor(Sample), y=Abundance, fill=phylum))+
  geom_bar(aes(color = phylum, fill = phylum), stat = "identity", position = "stack") +
  labs(x = "", y = "Relative Abundance\n", title = "Bacterial community composition") +
  facet_wrap( ~ matrix, ncol= 3, scales = "free") +
  theme_bw()+ theme(strip.background = element_rect(colour="black", fill="white"))+
  theme(axis.title.x=element_blank(),
        axis.text.x=element_text(angle=40,hjust=0.98,size=rel(1)),
        axis.text.y=element_text(size=rel(1)))

rm(DFOBJS1rel)

Four phyla dominated the composition of the bacterial communities. Proteobacteria, Bacteroidetes, and Actinobacteria in all environmental compartments. Cyanobacteria only in the biofilm compartment (see below).

Most abundant phyla - composition

#Agglomerate to phylum-level and rename

OBJS1rel_diff <- phyloseq::tax_glom(OBJS1rel, "phylum")
phyloseq::taxa_names(OBJS1rel_diff) <- phyloseq::tax_table(OBJS1rel_diff)[, "phylum"]

my_comparisons <- list( c("Water", "Biofilm"), c("Water", "Sediment"), c("Biofilm", "Sediment") )

# specific phyla (most abundant)
phyloseq::psmelt(OBJS1rel_diff) %>%
  filter(phylum %in% c(" Actinobacteria", " Bacteroidetes", " Cyanobacteria", " Proteobacteria")) %>%
  mutate_at(vars(matrix), factor, levels = matrix.order) %>%
  ggplot(data = ., aes(x = matrix, y = Abundance)) +
  geom_point(aes(shape=type, colour= type), position = "jitter", size = 2) +
  #scale_fill_manual(values = c("#009E73", "#56B4E9", "#E69F00"))+
  stat_summary(fun= median, geom="crossbar", width=1, color="black") +
  #stat_compare_means(comparisons = my_comparisons, label.y = c(0.7, 0.8, 0.9))+
  stat_compare_means(comparisons = my_comparisons, label = "p.signif")+
  labs(x = "", y = "Relative Abundance\n") +
  facet_wrap(~ phylum, scales = "free", ncol = 5)+
  theme_bw()+ theme(strip.background = element_rect(colour="black", fill="white"))+
  theme(axis.title.x=element_blank(),
        axis.text.x=element_text(angle=40,hjust=0.98,size=rel(1)),
        axis.text.y=element_text(size=rel(1)))

# statistics
phyloseq::psmelt(OBJS1rel_diff) %>%
  filter(phylum %in% c(" Actinobacteria", " Bacteroidetes", " Cyanobacteria", " Proteobacteria")) %>%
  mutate_at(vars(matrix), factor, levels = matrix.order) %>%
  select(phylum, Abundance, label, type, matrix) %>%
  mutate_at("Abundance", as.numeric) %>%
  group_by(phylum)%>%
  reframe(p.value = dunn.test(Abundance, matrix, method= "bh")$P.adjusted)

rm(my_comparisons, OBJS1rel_diff)

Venn diagram (unique and shared OTUs)

# Create new objects based on environmental matrix
wat_phy <- subset_samples(OBJS1, matrix == "Water")
sed_phy <- subset_samples(OBJS1, matrix == "Sediment")
bio_phy <- subset_samples(OBJS1, matrix == "Biofilm")

# convert to relative abundances each matrix
Wpseq.rel <- microbiome::transform(wat_phy, "compositional")
Spseq.rel <- microbiome::transform(sed_phy, "compositional")
Bpseq.rel <- microbiome::transform(bio_phy, "compositional")

#make a list of sampling site types
type <- unique(as.character(meta(Wpseq.rel)$type))

# Specify colors in the order they appear in list_core
mycols_type <- c(Reference="#00d99e", River="#00a8d9", Tributary="#E69F00") 

#Write a for loop to go through the type category and combine identified core taxa into a list
#Loop for Water
WaterUnique <- c() # an empty object to store information

for (n in type){ # for each variable n in water phyloseq object
  Wps.sub <- subset_samples(Wpseq.rel, type == n) # Choose sample from object
  
  Wcore_m <- core_members(Wps.sub, # ps.sub is phyloseq selected with only samples from g 
                          detection = 0.0001, # 0.0001 in at least 99% samples 
                          prevalence = 0.75)
  print(paste0("No. of core taxa in ", n, " : ", length(Wcore_m))) # print core taxa identified in each matrix
  WaterUnique[[n]] <- Wcore_m # add to a list core taxa for each group.
}
## [1] "No. of core taxa in Reference : 205"
## [1] "No. of core taxa in River : 167"
## [1] "No. of core taxa in Tributary : 401"
#Loop for Biofilm
BioUnique <- c() # an empty object to store information

for (n in type){ # for each variable n in water phyloseq object
  Bps.sub <- subset_samples(Bpseq.rel, type == n) # Choose sample from object
  
  Bcore_m <- core_members(Bps.sub, # ps.sub is phyloseq selected with only samples from g 
                          detection = 0.0001, # 0.0001 in at least 99% samples 
                          prevalence = 0.75)
  print(paste0("No. of core taxa in ", n, " : ", length(Bcore_m))) # print core taxa identified in each matrix
  BioUnique[[n]] <- Bcore_m # add to a list core taxa for each group.
  #print(list_core)
}
## [1] "No. of core taxa in Reference : 182"
## [1] "No. of core taxa in River : 253"
## [1] "No. of core taxa in Tributary : 165"
#Loop for Sediment
SedUnique <- c() # an empty object to store information

for (n in type){ # for each variable n in water phyloseq object
  Sps.sub <- subset_samples(Spseq.rel, type == n) # Choose sample from object
  
  Score_m <- core_members(Sps.sub, # ps.sub is phyloseq selected with only samples from g 
                          detection = 0.0001, # 0.0001 in at least 99% samples 
                          prevalence = 0.75)
  print(paste0("No. of core taxa in ", n, " : ", length(Score_m))) # print core taxa identified in each matrix
  SedUnique[[n]] <- Score_m # add to a list core taxa for each group.
  #print(list_core)
}
## [1] "No. of core taxa in Reference : 485"
## [1] "No. of core taxa in River : 397"
## [1] "No. of core taxa in Tributary : 412"
# plotting Venn
Wvenn<- plot(venn(WaterUnique), fills = mycols_type)
Bvenn<- plot(venn(BioUnique), fills = mycols_type)
Svenn<- plot(venn(SedUnique), fills = mycols_type)

grid.arrange(Wvenn, Bvenn, Svenn, ncol = 3)

rm(BioUnique, SedUnique, WaterUnique, Bps.sub, Bpseq.rel, Sps.sub, Spseq.rel, Wps.sub, Wpseq.rel, Wvenn, Bvenn, Svenn)

Each compartment showed a distinct pattern regarding the number of core OTUs. Water-OTUs had the highest abundances at tributary sites and the lowest at the main course of the river. The main course of the river showed the lowest number of unique OTUs (n=38) as well as the lowest shared OTUs with the reference sites (n=6). Conversely, biofilm-OTUs showed the lowest core OTUs in tributary sites and the highest in the main river course. Finally, sediments had similar numbers of core OTUs within the river basin. Although, tributary had the lowest number of unique OTUs like biofilms.

Between sample diversity (beta-diversity)

We applied a compositional beta diversity metric rooted in a centered log-ratio transformation (clr-transformation). By accounting for the sparse compositional nature of microbiome data sets, robust Aitchison PCA can yield high discriminatory power and salient feature ranking between microbial niches (Martino et al., 2019).

# Euclidean distance of clr-transformed compositions (called the Aitchison distance)
(OBJS1rclr <- microbiome::transform(OBJS1, transform= "rclr"))

# Generate distance matrix - Euclidean distances
OBJS1rclrdist<- phyloseq::distance(OBJS1rclr, method = "euclidean")

# betadisper is a multivariate analogue of Levene's test for homogeneity of variances
# This procedure has latterly been used as a means of assessing beta diversity.
# Two distance matrices were calculated based on: 1) environmental compartment ("matrix") and type of sampling site ("type")
dispr_matrix <- vegan::betadisper(OBJS1rclrdist, phyloseq::sample_data(OBJS1rclr)$matrix)
dispr_type <- vegan::betadisper(OBJS1rclrdist, phyloseq::sample_data(OBJS1rclr)$type)

# eigen values environmental compartments (water, biofilm, sediment)
sapply(dispr_matrix$eig[1:5], function(x) x / sum(dispr_matrix$eig))
# eigen values sampling site type (reference, tributary, river)
sapply(dispr_type$eig[1:5], function(x) x / sum(dispr_type$eig))

#Scale axes and plot ordination
eig1_matrix <- (dispr_matrix$eig[1] / sum(dispr_matrix$eig))*100
eig2_matrix <- (dispr_matrix$eig[2] / sum(dispr_matrix$eig))*100

eig1_type <- (dispr_type$eig[1] / sum(dispr_type$eig))*100
eig2_type <- (dispr_type$eig[2] / sum(dispr_type$eig))*100

# extract the centroids and the site points in multivariate space.  
centroids_matrix<- data.frame(grps=rownames(dispr_matrix$centroids), data.frame(dispr_matrix$centroids))
vectors_matrix<- data.frame(group= dispr_matrix$group, data.frame(dispr_matrix$vectors ))

centroids_type<- data.frame(grps=rownames(dispr_type$centroids), data.frame(dispr_type$centroids))
vectors_type<- data.frame(group= dispr_type$group, data.frame(dispr_type$vectors ))

# rearrange sites for further plotting
rank_matrix <- c("Biofilm","Sediment", "Water")
vectors_matrix<- vectors_matrix %>% arrange(sapply(group, function(y) which(y == rank_matrix)))

rank_type <- c("Reference","Tributary", "River")
vectors_type<- vectors_type %>% arrange(sapply(group, function(y) which(y == rank_type)))

# to create the lines from the centroids to each point we will put it in a format that ggplot can handle
seg.matrix<- cbind(vectors_matrix[,1:3], centroids_matrix[rep(1:nrow(centroids_matrix),as.data.frame(table(vectors_matrix$group))$Freq),2:3])
names(seg.matrix)<-c("group","v.PCoA1","v.PCoA2","PCoA1","PCoA2")

seg.type<- cbind(vectors_type[,1:3], centroids_type[rep(1:nrow(centroids_type),as.data.frame(table(vectors_type$group))$Freq),2:3])
names(seg.type)<-c("group","v.PCoA1","v.PCoA2","PCoA1","PCoA2")

# create the convex hulls of the outermost points
grp1.wat<-seg.matrix[seg.matrix$group=="Water",1:3][chull(seg.matrix[seg.matrix$group=="Water",2:3]),]
grp2.sed<-seg.matrix[seg.matrix$group=="Sediment",1:3][chull(seg.matrix[seg.matrix$group=="Sediment",2:3]),]
grp3.bio<-seg.matrix[seg.matrix$group=="Biofilm",1:3][chull(seg.matrix[seg.matrix$group=="Biofilm",2:3]),]
all.matrix<-rbind(grp1.wat, grp2.sed, grp3.bio)

grp1.ref<-seg.type[seg.type$group=="Reference",1:3][chull(seg.type[seg.type$group=="Reference",2:3]),]
grp2.tri<-seg.type[seg.type$group=="Tributary",1:3][chull(seg.type[seg.type$group=="Tributary",2:3]),]
grp3.riv<-seg.type[seg.type$group=="River",1:3][chull(seg.type[seg.type$group=="River",2:3]),]
all.type<-rbind(grp1.ref, grp2.tri, grp3.riv)

# rowname to first columns for further plotting
setDT(seg.matrix, keep.rownames = TRUE)[]
setDT(seg.type, keep.rownames = TRUE)[]

beta_matrix<-ggplot() + 
  geom_polygon(data=all.matrix[all.matrix=="Water",],aes(x=v.PCoA1,y=v.PCoA2),colour="#0000ff",alpha=0,linetype="solid") +
  geom_polygon(data=all.matrix[all.matrix=="Sediment",],aes(x=v.PCoA1,y=v.PCoA2),colour="#8b4513",alpha=0,linetype="solid") +
  geom_polygon(data=all.matrix[all.matrix=="Biofilm",],aes(x=v.PCoA1,y=v.PCoA2),colour="#32cd32",alpha=0,linetype="solid") +
  geom_segment(data=seg.matrix,aes(x= v.PCoA1, xend=PCoA1, y=v.PCoA2, yend=PCoA2),alpha=0.30) + 
  geom_point(data=centroids_matrix[,1:3], aes(x=PCoA1,y=PCoA2, shape=grps, color= grps), size=4) + 
  scale_color_manual(values=c("#32cd32", "#8b4513", "#0000ff"))+
  geom_text(data=centroids_matrix[,1:3], aes(x=PCoA1,y=PCoA2,label=grps),hjust=-0.2, vjust=-1.5, size=3)+
  geom_point(data=seg.matrix, aes(x=v.PCoA1,y=v.PCoA2, shape=group), size=2, alpha= 0.30) +
  geom_text(data=seg.matrix, aes(x=v.PCoA1,y=v.PCoA2,label=rn),hjust=-0.2, vjust=-1.5, size=3)+
  labs(title="A) Beta-diversity based on environmental matrix") +
  labs(x = paste("PCoA 1 (", round(eig1_matrix[[1]],1), "%", ")", sep = ""),
       y = paste("PCoA 2 (", round(eig2_matrix[[1]],1), "%", ")", sep = ""))+
  coord_cartesian(xlim = c(-30, 40), ylim = c(-30, 30)) +
  theme_bw()+
  theme(legend.position="none")

beta_sites<- ggplot() + 
  geom_polygon(data=all.type[all.type=="River",],aes(x=v.PCoA1,y=v.PCoA2),colour="#0000ff",alpha=0,linetype="solid") +
  geom_polygon(data=all.type[all.type=="Tributary",],aes(x=v.PCoA1,y=v.PCoA2),colour="#8b4513",alpha=0,linetype="solid") +
  geom_polygon(data=all.type[all.type=="Reference",],aes(x=v.PCoA1,y=v.PCoA2),colour="#32cd32",alpha=0,linetype="solid") +
  geom_segment(data=seg.type, aes(x= v.PCoA1, xend=PCoA1, y=v.PCoA2, yend=PCoA2),alpha=0.30) + 
  geom_point(data= centroids_type[,1:3], aes(x=PCoA1,y=PCoA2, shape=grps, color= grps), size=4) + 
  scale_color_manual(values=c("#32cd32", "#0000ff", "#8b4513"))+
  geom_text(data= centroids_type[,1:3], aes(x=PCoA1,y=PCoA2,label=grps),hjust=-0.2, vjust=-1.5, size=3)+
  geom_point(data= seg.type, aes(x=v.PCoA1,y=v.PCoA2, shape=group), size=2, alpha= 0.30) +
  geom_text(data= seg.type, aes(x=v.PCoA1,y=v.PCoA2,label=rn),hjust=-0.2, vjust=-1.5, size=3)+
  labs(title="B) Beta-diversity based on sampling sites") +
  labs(x = paste("PCoA 1 (", round(eig1_type[[1]],1), "%", ")", sep = ""),
       y = paste("PCoA 2 (", round(eig2_type[[1]],1), "%", ")", sep = ""))+
  coord_cartesian(xlim = c(-30, 40), ylim = c(-30, 30)) +
  theme_bw()+
  theme(legend.position="none")

grid.arrange(beta_matrix, beta_sites, ncol = 2)

rm(grp1.ref, grp1.wat, grp2.sed, grp2.tri, grp3.bio, grp3.riv, centroids_matrix, centroids_type, vectors_matrix, vectors_type, seg.matrix, seg.type, all.matrix, all.type)

Permutational analysis of variance (PERMANOVA)

PERMANOVA is a widely used non-parametric multivariate method that can be used to estimate the actual statistical significance of differences in the observed community composition between two groups of samples. PERMANOVA evaluates the hypothesis that the centroids and dispersion of the community are equivalent between the compared groups. A small p-value indicates that the compared groups have, on average, a different community composition.

#ADONIS test (environmental matrices)
pairwise.adonis(OBJS1rclrdist, phyloseq::sample_data(OBJS1rclr)$matrix, p.adjust.m = "BH", perm = 999)
##                 pairs Df SumsOfSqs  F.Model         R2 p.value p.adjusted sig
## 1   Water vs Sediment  1  5407.084 2.902526 0.08084462   0.001      0.001  **
## 2    Water vs Biofilm  1  4398.111 2.752426 0.10688028   0.001      0.001  **
## 3 Sediment vs Biofilm  1  9256.228 4.983876 0.11079249   0.001      0.001  **

Since the environmental compartment are different, there is not statistical support to consider the community as a whole and thus we split the data. We assess if the distance between sampling sites is relevant in the struxturing of the bacterial communities in water, biofilm, and sediment.

# subset per matrix
OBJwater<- subset_samples(OBJS1, matrix == "Water")
OBJbiofilm<- subset_samples(OBJS1, matrix == "Biofilm")
OBJsed<- subset_samples(OBJS1, matrix == "Sediment")

# Aitchison distance
(OBJwater <- microbiome::transform(OBJwater, transform= "rclr"))
(OBJbiofilm <- microbiome::transform(OBJbiofilm, transform= "rclr"))
(OBJsed <- microbiome::transform(OBJsed, transform= "rclr"))

rOBJwaterdist<- phyloseq::distance(OBJwater, method = "euclidean")
rOBJbiofilmdist<- phyloseq::distance(OBJbiofilm, method = "euclidean")
rOBJseddist<- phyloseq::distance(OBJsed, method = "euclidean")

PERMANOVA between sampling site per each environmental compartment

Multiple comparison between reference, tributary, and river sites per each environmental compartment.

# Water
pairwise.adonis(rOBJwaterdist, phyloseq::sample_data(OBJwater)$type, p.adjust.m = "BH", perm = 999)
##                    pairs Df SumsOfSqs  F.Model        R2 p.value p.adjusted sig
## 1     Reference vs River  1  1616.129 1.160081 0.2248183     0.3        0.4    
## 2 Reference vs Tributary  1  2231.212 1.540278 0.2780146     0.1        0.3    
## 3     River vs Tributary  1  1432.516 1.035969 0.2057140     0.4        0.4
# Biofilm
pairwise.adonis(rOBJbiofilmdist, phyloseq::sample_data(OBJbiofilm)$type, p.adjust.m = "BH", perm = 999)
##                    pairs Df SumsOfSqs  F.Model        R2 p.value p.adjusted sig
## 1     River vs Reference  1  3282.259 2.222449 0.2174087   0.006     0.0180   .
## 2     River vs Tributary  1  2259.972 1.461131 0.1544351   0.096     0.0960    
## 3 Reference vs Tributary  1  2260.417 1.501073 0.1305159   0.037     0.0555
# Sediment
pairwise.adonis(rOBJseddist, phyloseq::sample_data(OBJsed)$type, p.adjust.m = "BH", perm = 999)
##                    pairs Df SumsOfSqs  F.Model        R2 p.value p.adjusted sig
## 1     Reference vs River  1  5535.533 2.950968 0.1643904   0.001     0.0015   *
## 2 Reference vs Tributary  1  4716.779 2.415936 0.1311873   0.001     0.0015   *
## 3     River vs Tributary  1  2679.511 1.797206 0.1069944   0.004     0.0040   *
rm(beta_matrix, beta_sites, dispr_matrix, dispr_type, OBJS1rclr)

Remarks regarding beta-diversity

  • Water, biofilm, and sediment bacterial communities are significantly different between each other.
  • No significant differences in the water compartment. Likely due to poor replication.
  • River and tributary sites different to reference in the biofilm compartment.
  • All sediment sites different between each other.

Role of environmental variables shaping bacterial community structure.

we performed a distance-based redundancy analysis (dbRDA) to generate a constrained ordination diagram, using the significant main effects and interaction terms determined from the PERMANOVA analysis as categorical predictor variables (Anderson et al. 2008). Distance-based redundancy analysis is a direct analogue to traditional redundancy analysis (RDA), but is more flexible in that it can be used with non-Euclidean measures of distance (such as Bray–Curtis dissimilarity) that are often more appropriate for the analysis of ecological data (Legendre & Anderson 1999). dbRDA was conducted only in biofilm and sediments compartments since these were the compartment showing significant variation between sampling sites.

# Objects already subsetted and clr-transformed (see above)
OBJbiofilm
OBJsed

# RDA or dbRDA work using a dataframe of OTUs and environmental variables from the phyloseq object
# extract the OTU table
Y_bio <- veganotu(OBJbiofilm)
Y_sed <- veganotu(OBJsed)

# extract the sample data from the 'phyloseq' object
DT_bio <- data.frame(sample_data(OBJbiofilm))
DT_sed <- data.frame(sample_data(OBJsed))

# continuous (numeric) variables
DT_bio_num <- select_if(DT_bio, is.numeric)
DT_sed_num <- select_if(DT_sed, is.numeric)

# select the continuous (numeric) variables to use in the constraint (Y) then standardise
X_bio <- decostand(DT_bio_num, method='hellinger')
X_sed <- decostand(DT_sed_num, method='hellinger')

We tested which distance metric suits the best our data. Euclidean distances were ranked the highest and thus used as distance index within the dbRDA.

# distances
rankindex(X_bio, Y_bio, indices = c("euc", "man", "gow","bra", "kul"), stepacross= FALSE, method = "spearman")
##          euc          man          gow          bra          kul 
##  0.132134371 -0.079319539 -0.196429479 -0.003894893  0.047681679
rankindex(X_sed, Y_sed, indices = c("euc", "man", "gow","bra", "kul"), stepacross= FALSE, method = "spearman")
##         euc         man         gow         bra         kul 
##  0.20247372  0.18516821  0.16356833 -0.20789536 -0.03987936
# note the result is the same as for RDA when using euclidean distances
res_bio <- capscale(Y_bio ~ ., data=X_bio, distance= 'euc')
res_sed <- capscale(Y_sed ~ ., data=X_sed, distance= 'euc')

# summary of the results
summary(res_bio, display=NULL)
## 
## Call:
## capscale(formula = Y_bio ~ TU_MIC + nitrate + nitrite + phosphate +      silicate + temp + light + pH + conductivity + oxygen, data = X_bio,      distance = "euc") 
## 
## Partitioning of mean squared Euclidean distance:
##               Inertia Proportion
## Total          1652.2     1.0000
## Constrained     989.8     0.5991
## Unconstrained   662.4     0.4009
## 
## Eigenvalues, and their contribution to the mean squared Euclidean distance 
## 
## Importance of components:
##                           CAP1     CAP2      CAP3      CAP4     CAP5     CAP6
## Eigenvalue            270.7105 189.5220 155.32660 126.33792 96.71303 90.84538
## Proportion Explained    0.1638   0.1147   0.09401   0.07647  0.05854  0.05498
## Cumulative Proportion   0.1638   0.2786   0.37257   0.44904  0.50757  0.56256
##                           CAP7      MDS1     MDS2     MDS3     MDS4     MDS5
## Eigenvalue            60.38319 135.13198 115.4834 95.93505 75.31354 66.76526
## Proportion Explained   0.03655   0.08179   0.0699  0.05807  0.04558  0.04041
## Cumulative Proportion  0.59911   0.68089   0.7508  0.80886  0.85444  0.89485
##                           MDS6     MDS7     MDS8
## Eigenvalue            63.84677 57.63783 52.24231
## Proportion Explained   0.03864  0.03489  0.03162
## Cumulative Proportion  0.93349  0.96838  1.00000
## 
## Accumulated constrained eigenvalues
## Importance of components:
##                           CAP1     CAP2     CAP3     CAP4     CAP5     CAP6
## Eigenvalue            270.7105 189.5220 155.3266 126.3379 96.71303 90.84538
## Proportion Explained    0.2735   0.1915   0.1569   0.1276  0.09771  0.09178
## Cumulative Proportion   0.2735   0.4650   0.6219   0.7495  0.84722  0.93900
##                         CAP7
## Eigenvalue            60.383
## Proportion Explained   0.061
## Cumulative Proportion  1.000
## 
## Scaling 2 for species and site scores
## * Species are scaled proportional to eigenvalues
## * Sites are unscaled: weighted dispersion equal on all dimensions
## * General scaling constant of scores:
summary(res_sed, display=NULL)
## 
## Call:
## capscale(formula = Y_sed ~ TU_MIC + nitrate + nitrite + phosphate +      silicate + temp + light + pH + conductivity + oxygen, data = X_sed,      distance = "euc") 
## 
## Partitioning of mean squared Euclidean distance:
##               Inertia Proportion
## Total          1980.3     1.0000
## Constrained     924.6     0.4669
## Unconstrained  1055.7     0.5331
## 
## Eigenvalues, and their contribution to the mean squared Euclidean distance 
## 
## Importance of components:
##                           CAP1      CAP2      CAP3      CAP4     CAP5     CAP6
## Eigenvalue            266.0886 150.03464 129.59938 123.28043 86.84409 70.62204
## Proportion Explained    0.1344   0.07577   0.06545   0.06225  0.04385  0.03566
## Cumulative Proportion   0.1344   0.21014   0.27558   0.33784  0.38169  0.41735
##                           CAP7    CAP8      MDS1      MDS2     MDS3     MDS4
## Eigenvalue            62.88538 35.2448 166.12468 119.43855 84.99138 70.80453
## Proportion Explained   0.03176  0.0178   0.08389   0.06031  0.04292  0.03576
## Cumulative Proportion  0.44911  0.4669   0.55080   0.61111  0.65403  0.68979
##                           MDS5     MDS6     MDS7     MDS8     MDS9    MDS10
## Eigenvalue            66.83827 61.61598 59.11491 54.53460 53.13408 49.60325
## Proportion Explained   0.03375  0.03112  0.02985  0.02754  0.02683  0.02505
## Cumulative Proportion  0.72354  0.75466  0.78451  0.81205  0.83888  0.86393
##                         MDS11    MDS12    MDS13    MDS14    MDS15    MDS16
## Eigenvalue            48.7090 46.90284 41.16314 39.29550 32.70961 31.99295
## Proportion Explained   0.0246  0.02369  0.02079  0.01984  0.01652  0.01616
## Cumulative Proportion  0.8885  0.91221  0.93300  0.95284  0.96936  0.98551
##                          MDS17
## Eigenvalue            28.68653
## Proportion Explained   0.01449
## Cumulative Proportion  1.00000
## 
## Accumulated constrained eigenvalues
## Importance of components:
##                           CAP1     CAP2     CAP3     CAP4     CAP5     CAP6
## Eigenvalue            266.0886 150.0346 129.5994 123.2804 86.84409 70.62204
## Proportion Explained    0.2878   0.1623   0.1402   0.1333  0.09393  0.07638
## Cumulative Proportion   0.2878   0.4501   0.5902   0.7236  0.81749  0.89387
##                           CAP7     CAP8
## Eigenvalue            62.88538 35.24482
## Proportion Explained   0.06801  0.03812
## Cumulative Proportion  0.96188  1.00000
## 
## Scaling 2 for species and site scores
## * Species are scaled proportional to eigenvalues
## * Sites are unscaled: weighted dispersion equal on all dimensions
## * General scaling constant of scores:
# Testing the level of significance of the biofilm and sediment dbRDA models
anova.cca(res_bio, perm.max=999)
## Permutation test for capscale under reduced model
## Permutation: free
## Number of permutations: 999
## 
## Model: capscale(formula = Y_bio ~ TU_MIC + nitrate + nitrite + phosphate + silicate + temp + light + pH + conductivity + oxygen, data = X_bio, distance = "euc")
##          Df Variance      F Pr(>F)    
## Model     7   989.84 1.7079  0.001 ***
## Residual  8   662.36                  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova.cca(res_sed, perm.max=999)
## Permutation test for capscale under reduced model
## Permutation: free
## Number of permutations: 999
## 
## Model: capscale(formula = Y_sed ~ TU_MIC + nitrate + nitrite + phosphate + silicate + temp + light + pH + conductivity + oxygen, data = X_sed, distance = "euc")
##          Df Variance      F Pr(>F)    
## Model     8    924.6 1.8612  0.001 ***
## Residual 17   1055.7                  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# by environmental variables
anova.cca(res_bio, by= "terms", perm.max=999) 
## Permutation test for capscale under reduced model
## Terms added sequentially (first to last)
## Permutation: free
## Number of permutations: 999
## 
## Model: capscale(formula = Y_bio ~ TU_MIC + nitrate + nitrite + phosphate + silicate + temp + light + pH + conductivity + oxygen, data = X_bio, distance = "euc")
##           Df Variance      F Pr(>F)    
## TU_MIC     1   102.54 1.2384  0.144    
## nitrate    1   226.50 2.7357  0.001 ***
## nitrite    1   146.00 1.7634  0.011 *  
## phosphate  1    96.34 1.1637  0.212    
## silicate   1   126.88 1.5324  0.032 *  
## temp       1   133.58 1.6134  0.020 *  
## light      1   158.00 1.9084  0.002 ** 
## Residual   8   662.36                  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova.cca(res_sed, by= "terms", perm.max=999) 
## Permutation test for capscale under reduced model
## Terms added sequentially (first to last)
## Permutation: free
## Number of permutations: 999
## 
## Model: capscale(formula = Y_sed ~ TU_MIC + nitrate + nitrite + phosphate + silicate + temp + light + pH + conductivity + oxygen, data = X_sed, distance = "euc")
##           Df Variance      F Pr(>F)    
## TU_MIC     1    96.60 1.5556  0.036 *  
## nitrate    1   166.26 2.6775  0.001 ***
## nitrite    1    99.81 1.6073  0.020 *  
## phosphate  1   145.35 2.3406  0.001 ***
## silicate   1   131.36 2.1153  0.002 ** 
## temp       1    94.62 1.5238  0.031 *  
## light      1   100.15 1.6128  0.016 *  
## pH         1    90.45 1.4566  0.033 *  
## Residual  17  1055.66                  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# New model using only significant variables
bio_sig<- capscale(Y_bio~ nitrate+nitrite+silicate+temp+light, data= X_bio, distance= 'euc')
sed_sig<- capscale(Y_sed~ TU_MIC + nitrate + nitrite + phosphate + silicate + temp + light + pH, data= X_sed, distance= 'euc')

# Testing the level of significance of the biofilm and sediment dbRDA models
anova.cca(bio_sig, perm.max=999)
## Permutation test for capscale under reduced model
## Permutation: free
## Number of permutations: 999
## 
## Model: capscale(formula = Y_bio ~ nitrate + nitrite + silicate + temp + light, data = X_bio, distance = "euc")
##          Df Variance      F Pr(>F)    
## Model     5   713.62 1.5207  0.001 ***
## Residual 10   938.57                  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova.cca(sed_sig, perm.max=999)
## Permutation test for capscale under reduced model
## Permutation: free
## Number of permutations: 999
## 
## Model: capscale(formula = Y_sed ~ TU_MIC + nitrate + nitrite + phosphate + silicate + temp + light + pH, data = X_sed, distance = "euc")
##          Df Variance      F Pr(>F)    
## Model     8    924.6 1.8612  0.001 ***
## Residual 17   1055.7                  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# by environmental variables
anova.cca(bio_sig, by= "terms", perm.max=999) 
## Permutation test for capscale under reduced model
## Terms added sequentially (first to last)
## Permutation: free
## Number of permutations: 999
## 
## Model: capscale(formula = Y_bio ~ nitrate + nitrite + silicate + temp + light, data = X_bio, distance = "euc")
##          Df Variance      F Pr(>F)    
## nitrate   1   231.64 2.4680  0.001 ***
## nitrite   1   116.38 1.2400  0.136    
## silicate  1   105.18 1.1206  0.233    
## temp      1   121.98 1.2996  0.106    
## light     1   138.45 1.4751  0.035 *  
## Residual 10   938.57                  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova.cca(sed_sig, by= "terms", perm.max=999) 
## Permutation test for capscale under reduced model
## Terms added sequentially (first to last)
## Permutation: free
## Number of permutations: 999
## 
## Model: capscale(formula = Y_sed ~ TU_MIC + nitrate + nitrite + phosphate + silicate + temp + light + pH, data = X_sed, distance = "euc")
##           Df Variance      F Pr(>F)    
## TU_MIC     1    96.60 1.5556  0.044 *  
## nitrate    1   166.26 2.6775  0.001 ***
## nitrite    1    99.81 1.6073  0.015 *  
## phosphate  1   145.35 2.3406  0.001 ***
## silicate   1   131.36 2.1153  0.003 ** 
## temp       1    94.62 1.5238  0.024 *  
## light      1   100.15 1.6128  0.019 *  
## pH         1    90.45 1.4566  0.049 *  
## Residual  17  1055.66                  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# final models
bio_sig<- capscale(Y_bio~ nitrate+light, data= X_bio, distance= 'euc')
sed_sig<- capscale(Y_sed~ TU_MIC + nitrate + nitrite + phosphate + silicate + temp + light + pH, data= X_sed, distance= 'euc')

# summary of the results
summary(bio_sig, display=NULL)
## 
## Call:
## capscale(formula = Y_bio ~ nitrate + light, data = X_bio, distance = "euc") 
## 
## Partitioning of mean squared Euclidean distance:
##               Inertia Proportion
## Total          1652.2     1.0000
## Constrained     324.8     0.1966
## Unconstrained  1327.4     0.8034
## 
## Eigenvalues, and their contribution to the mean squared Euclidean distance 
## 
## Importance of components:
##                           CAP1     CAP2     MDS1     MDS2      MDS3      MDS4
## Eigenvalue            235.3512 89.45539 199.2420 175.0111 139.19317 131.15529
## Proportion Explained    0.1424  0.05414   0.1206   0.1059   0.08425   0.07938
## Cumulative Proportion   0.1424  0.19659   0.3172   0.4231   0.50736   0.58674
##                            MDS5     MDS6     MDS7     MDS8    MDS9    MDS10
## Eigenvalue            116.77591 94.91604 88.80871 84.86334 74.5202 62.14433
## Proportion Explained    0.07068  0.05745  0.05375  0.05136  0.0451  0.03761
## Cumulative Proportion   0.65742  0.71487  0.76862  0.81998  0.8651  0.90270
##                          MDS11    MDS12    MDS13
## Eigenvalue            57.03078 52.45052 51.27663
## Proportion Explained   0.03452  0.03175  0.03104
## Cumulative Proportion  0.93722  0.96896  1.00000
## 
## Accumulated constrained eigenvalues
## Importance of components:
##                           CAP1    CAP2
## Eigenvalue            235.3512 89.4554
## Proportion Explained    0.7246  0.2754
## Cumulative Proportion   0.7246  1.0000
## 
## Scaling 2 for species and site scores
## * Species are scaled proportional to eigenvalues
## * Sites are unscaled: weighted dispersion equal on all dimensions
## * General scaling constant of scores:
summary(sed_sig, display=NULL)
## 
## Call:
## capscale(formula = Y_sed ~ TU_MIC + nitrate + nitrite + phosphate +      silicate + temp + light + pH, data = X_sed, distance = "euc") 
## 
## Partitioning of mean squared Euclidean distance:
##               Inertia Proportion
## Total          1980.3     1.0000
## Constrained     924.6     0.4669
## Unconstrained  1055.7     0.5331
## 
## Eigenvalues, and their contribution to the mean squared Euclidean distance 
## 
## Importance of components:
##                           CAP1      CAP2      CAP3      CAP4     CAP5     CAP6
## Eigenvalue            266.0886 150.03464 129.59938 123.28043 86.84409 70.62204
## Proportion Explained    0.1344   0.07577   0.06545   0.06225  0.04385  0.03566
## Cumulative Proportion   0.1344   0.21014   0.27558   0.33784  0.38169  0.41735
##                           CAP7    CAP8      MDS1      MDS2     MDS3     MDS4
## Eigenvalue            62.88538 35.2448 166.12468 119.43855 84.99138 70.80453
## Proportion Explained   0.03176  0.0178   0.08389   0.06031  0.04292  0.03576
## Cumulative Proportion  0.44911  0.4669   0.55080   0.61111  0.65403  0.68979
##                           MDS5     MDS6     MDS7     MDS8     MDS9    MDS10
## Eigenvalue            66.83827 61.61598 59.11491 54.53460 53.13408 49.60325
## Proportion Explained   0.03375  0.03112  0.02985  0.02754  0.02683  0.02505
## Cumulative Proportion  0.72354  0.75466  0.78451  0.81205  0.83888  0.86393
##                         MDS11    MDS12    MDS13    MDS14    MDS15    MDS16
## Eigenvalue            48.7090 46.90284 41.16314 39.29550 32.70961 31.99295
## Proportion Explained   0.0246  0.02369  0.02079  0.01984  0.01652  0.01616
## Cumulative Proportion  0.8885  0.91221  0.93300  0.95284  0.96936  0.98551
##                          MDS17
## Eigenvalue            28.68653
## Proportion Explained   0.01449
## Cumulative Proportion  1.00000
## 
## Accumulated constrained eigenvalues
## Importance of components:
##                           CAP1     CAP2     CAP3     CAP4     CAP5     CAP6
## Eigenvalue            266.0886 150.0346 129.5994 123.2804 86.84409 70.62204
## Proportion Explained    0.2878   0.1623   0.1402   0.1333  0.09393  0.07638
## Cumulative Proportion   0.2878   0.4501   0.5902   0.7236  0.81749  0.89387
##                           CAP7     CAP8
## Eigenvalue            62.88538 35.24482
## Proportion Explained   0.06801  0.03812
## Cumulative Proportion  0.96188  1.00000
## 
## Scaling 2 for species and site scores
## * Species are scaled proportional to eigenvalues
## * Sites are unscaled: weighted dispersion equal on all dimensions
## * General scaling constant of scores:
rm(X_bio, X_sed, Y_bio, Y_sed, DT_bio_num, DT_sed_num, res_bio, res_sed)

The models were reduced

# exploratory plots
plot(bio_sig, scaling=2, type= "text")

plot(sed_sig, scaling=2, type= "text")

Replotting dbRDA using ggplot

# extract % explained by the first two axes
perc_bio <- round(100*(summary(bio_sig)$cont$importance[2, 1:2]), 2)
perc_sed <- round(100*(summary(sed_sig)$cont$importance[2, 1:2]), 2)

## extract scores - these are coordinates in the RDA space
sit_bio <- cbind(DT_bio, scores(bio_sig, display= 'sites'))
sit_sed <- cbind(DT_sed, scores(sed_sig, display= 'sites'))

spp_bio <- cbind(data.frame(tax_table(OBJbiofilm)), scores(bio_sig, display='species'))
spp_sed <- cbind(data.frame(tax_table(OBJsed)), scores(sed_sig, display='species'))

# extracting variable vectors
vec_bio <- data.frame(scores(bio_sig, display='bp'))
vec_sed <- data.frame(scores(sed_sig, display='bp'))

# labing vectos
arrowdf_bio <- data.frame(labels = rownames(vec_bio), vec_bio)
arrowdf_sed <- data.frame(labels = rownames(vec_sed), vec_sed)

# setting arrows
arrow_map <- aes(xend = CAP1*6.162525, yend = CAP2*6.162525, x = 0, y = 0, shape = NULL, color = NULL, label = labels)
label_map <- aes(x = CAP1*7, y = CAP2*7, shape = NULL, color = NULL, label = labels)
arrowhead = arrow(length = unit(0.02, "npc"))

# Plotting

plot_bio<- ggplot(sit_bio, aes(x=CAP1, y=CAP2, color= type, shape= matrix)) +
  geom_point(size=3) + 
  scale_color_manual(values=c("#228B22", "#56B4E9", "#E69F00"))+
  geom_text(aes(label= site),hjust=-0.2, vjust=-1.5, size=3)+
  theme_bw()+
  geom_segment(mapping = arrow_map, size = .5, data = arrowdf_bio, color = "black", arrow = arrowhead) + 
  geom_text(mapping = label_map, size = 4,  data = arrowdf_bio, show.legend = FALSE)+
  labs(title="A) db-RDA biofilm communities") +
  labs(x = paste("PCoA 1 (", round(perc_bio[[1]],1), "%", ")", sep = ""),
       y = paste("PCoA 2 (", round(perc_bio[[2]],1), "%", ")", sep = ""))+
  theme(legend.position="none")

plot_sed<- ggplot(sit_sed, aes(x=CAP1, y=CAP2, color= type, shape= matrix)) +
  geom_point(size=3) + 
  scale_color_manual(values=c("#228B22", "#56B4E9", "#E69F00"))+
  geom_text(aes(label= site),hjust=-0.2, vjust=-1.5, size=3)+
  theme_bw()+
  geom_segment(mapping = arrow_map, size = .5, data = arrowdf_sed, color = "black", arrow = arrowhead) + 
  geom_text(mapping = label_map, size = 4,  data = arrowdf_sed, show.legend = FALSE)+
  labs(title="B) db-RDA sediment communities") +
  labs(x = paste("PCoA 1 (", round(perc_sed[[1]],1), "%", ")", sep = ""),
       y = paste("PCoA 2 (", round(perc_sed[[2]],1), "%", ")", sep = ""))+
  theme(legend.position="none")

grid.arrange(plot_bio, plot_sed, ncol = 2)

rm(arrowdf_bio, arrowdf_sed, arrow_map, arrowhead, label_map, sit_bio, sit_sed, spp_bio, spp_sed, vec_bio, vec_sed)

Remarks regarding beta-diversity

  • The role of environmental variables explaining the observed variance was only assessed in biofilm and sediment
  • We created final dbRDA models containing only significant variables
  • Nutrients represented by nitrate and light intensity were two variables explaining most of the variation in biofilms
    • Light intensity was significantly correlated with reference sites and one of the tributary sites (T2)
    • Nitrate correlated with the main river course and the rest of the tributary sites
  • Sediment variation was explained by several variables, all of them significant
    • Light intensity correlated with reference sites likewise in biofilms
    • Several nutrients played an explanatory role only in tributary sites
    • Antibacterial pressure, TU_MIC, correlated with the main river course, particularly the head and middle waters within the river basin

Differential abundance analysis (DAA)

DAA aims to find the differences in the abundance of each taxa between two samples, assigning a significance value to each comparison. DAA tool can help identify highly confident microbial candidates for further biological validation.

The DAA is based on the outputs from the dbRDA analysis where TU_MIC correlated with two sites in the main river course (R1 and R2) in sediments. Thus, comparisons are conducted between the reference sites and R1 and R2 (i.e., Reference vs R1 & Reference vs R2) to identified taxa being promoted and diminished.

# Sites to test based on dbRDA
# SEDIMENT Reference vs R1

DESeqSed<- subset_samples(OBJS1, matrix == "Sediment")

#convert phyloseq to deseq2 format
deseq_sed_sites = phyloseq_to_deseq2(DESeqSed, ~ label)

#remove genus without any counts
ds_sed_sites<- deseq_sed_sites[rowSums(counts(deseq_sed_sites)) > 0,]

# calculate geometric means prior to estimate size factors
geoMeans_sed_sites = apply(counts(ds_sed_sites), 1, geo_mean)

#calculate the size factor and add it to the data set
ds_sed_sites = estimateSizeFactors(ds_sed_sites, geoMeans = geoMeans_sed_sites)

# run deseq
# Note: The default multiple-inference correction is Benjamini-Hochberg, and occurs within the DESeq function.
DESeq_sed_sites = DESeq(ds_sed_sites, test="Wald", fitType="parametric")

# result summary overall comparison
res_desq_sed_sites = results(DESeq_sed_sites)

# results summary for selected sites
DESeq_sed_RefR1<- results(DESeq_sed_sites, contrast= c("label", "RS1", "R1"), cooksCutoff= FALSE, pAdjustMethod= "BH")
DESeq_sed_RefR2<- results(DESeq_sed_sites, contrast= c("label", "RS1", "R2"), cooksCutoff= FALSE, pAdjustMethod= "BH")

#order
DESeq_sed_RefR1 = DESeq_sed_RefR1[order(DESeq_sed_RefR1$padj, na.last=NA), ]
DESeq_sed_RefR2 = DESeq_sed_RefR2[order(DESeq_sed_RefR2$padj, na.last=NA), ]

# subset for further volcano figure only
volsed_REFR1 = cbind(as(DESeq_sed_RefR1, "data.frame"), as(tax_table(DESeqSed)[rownames(DESeq_sed_RefR1), ], "matrix"))
volsed_REFR2 = cbind(as(DESeq_sed_RefR2, "data.frame"), as(tax_table(DESeqSed)[rownames(DESeq_sed_RefR2), ], "matrix"))

# cutt-off p-value
alpha = 0.01

# filter only significantly abundance
sig_sed_RefR1 = DESeq_sed_RefR1[(DESeq_sed_RefR1$padj < alpha), ] 
sig_sed_RefR2 = DESeq_sed_RefR2[(DESeq_sed_RefR2$padj < alpha), ] 

# only significant sites to dataframe
DFsig_sed_RefR1= cbind(as(sig_sed_RefR1, "data.frame"), as(tax_table(DESeqSed)[rownames(sig_sed_RefR1), ], "matrix"))
DFsig_sed_RefR2= cbind(as(sig_sed_RefR2, "data.frame"), as(tax_table(DESeqSed)[rownames(sig_sed_RefR2), ], "matrix"))

# phyla order
x_sRefR1 = tapply(DFsig_sed_RefR1$log2FoldChange, DFsig_sed_RefR1$phylum, function(x) max(x))
x_sRefR2 = tapply(DFsig_sed_RefR2$log2FoldChange, DFsig_sed_RefR2$phylum, function(x) max(x))

x_sRefR1 = sort(x_sRefR1, TRUE)
x_sRefR2 = sort(x_sRefR2, TRUE)

DFsig_sed_RefR1$phylum = factor(as.character(DFsig_sed_RefR1$phylum), levels=names(x_sRefR1))
DFsig_sed_RefR2$phylum = factor(as.character(DFsig_sed_RefR2$phylum), levels=names(x_sRefR2))

# Genus order
x_sRefR1 = tapply(DFsig_sed_RefR1$log2FoldChange, DFsig_sed_RefR1$genus, function(x) max(x))
x_sRefR2 = tapply(DFsig_sed_RefR2$log2FoldChange, DFsig_sed_RefR2$genus, function(x) max(x))

x_sRefR1 = sort(x_sRefR1, TRUE)
x_sRefR2 = sort(x_sRefR2, TRUE)

DFsig_sed_RefR1$genus = factor(as.character(DFsig_sed_RefR1$genus), levels=names(x_sRefR1))
DFsig_sed_RefR2$genus = factor(as.character(DFsig_sed_RefR2$genus), levels=names(x_sRefR2))

# plotting

sed_RefR1<-ggplot(volsed_REFR1, aes(y= -log(padj), x=log2FoldChange)) + 
  geom_point(shape=1 , size= 1) +
  geom_point(data = DFsig_sed_RefR1, aes(y= -log(padj), x=log2FoldChange, col= genus), size=3)+
  geom_vline(xintercept = 0.0, color = "gray", size = 0.5) +
  lims(x = c(-22, 22), y = c(0, 20))+
  geom_text_repel(data= DFsig_sed_RefR1, aes(y= -log(padj), x=log2FoldChange, label=genus),box.padding= 0.2, max.overlaps= 50, min.segment.length= 0, size= 3.5) +
  ggtitle("Differential abundance Reference vs R1")+
  facet_wrap( ~ kingdom, scales = "free", ncol = 2) +
  theme(axis.text.x = element_text(angle = -90, hjust = 0, vjust=0.5))+
  theme_bw() + theme(legend.position="none")

sed_RefR2<-ggplot(volsed_REFR2, aes(y= -log(padj), x=log2FoldChange)) + 
  geom_point(shape=1 , size= 1) +
  geom_point(data = DFsig_sed_RefR2, aes(y= -log(padj), x=log2FoldChange, col= genus), size=3)+
  geom_vline(xintercept = 0.0, color = "gray", size = 0.5) +
  lims(x = c(-22, 22), y = c(0, 20))+
  geom_text_repel(data= DFsig_sed_RefR2, aes(y= -log(padj), x=log2FoldChange, label=genus),box.padding= 0.2, max.overlaps= 50, min.segment.length= 0, size= 3.5) +
  ggtitle("Differential abundance Reference vs R2")+
  facet_wrap( ~ kingdom, scales = "free", ncol = 2) +
  theme(axis.text.x = element_text(angle = -90, hjust = 0, vjust=0.5))+
  theme_bw() + theme(legend.position="none")

grid.arrange(sed_RefR1, sed_RefR2, ncol = 1)

# Clean table with DAA taxa
CleanListRefR1<- volsed_REFR1 %>%
  rownames_to_column("OTU") %>%
  select(OTU, baseMean, log2FoldChange, lfcSE, padj, kingdom, phylum, genus, species) %>%
  add_column(Comparison="RefvsR1")%>%
  filter(!genus == "NA") %>%
  filter (! duplicated(genus)) %>%
  filter(padj < 0.01) %>%
  arrange(desc(log2FoldChange))

CleanListRefR2<- volsed_REFR2 %>%
  rownames_to_column("OTU") %>%
  select(OTU, baseMean, log2FoldChange, lfcSE, padj, kingdom, phylum, genus, species) %>%
  add_column(Comparison="RefvsR2")%>%
  filter(!genus == "NA") %>%
  filter (! duplicated(genus)) %>%
  filter(padj < 0.01) %>%
  arrange(desc(log2FoldChange))

# merged both sets in order to plot facet_wrap
DAA_table<- bind_rows(CleanListRefR1, CleanListRefR2)
DAA_table
##          OTU  baseMean log2FoldChange     lfcSE           padj  kingdom
## 1    OTU_763  4.556839      14.394049 2.5007709 0.000003804712  Archaea
## 2    OTU_526  6.141868      13.875486 4.0127869 0.009363765333 Bacteria
## 3  OTU_20620 11.111793       7.809461 1.5460494 0.000053367503 Bacteria
## 4    OTU_206 14.250780       7.025307 1.5368282 0.000320872578 Bacteria
## 5  OTU_11145  9.795910       6.890874 1.5352301 0.000412902094 Bacteria
## 6   OTU_1389  6.400034       6.881370 1.8634198 0.004608221013 Bacteria
## 7   OTU_1044  5.721028       6.306234 1.5422662 0.001529971788 Bacteria
## 8    OTU_300  7.097668       6.268286 1.7648714 0.007137619620 Bacteria
## 9    OTU_204 16.931964       5.801542 1.1469941 0.000053367503 Bacteria
## 10   OTU_316 18.955850       5.796918 1.6115887 0.006457160605  Archaea
## 11    OTU_84 33.936217       5.726901 1.1631121 0.000093364344 Bacteria
## 12   OTU_640  5.125785       5.610460 1.5197469 0.004608221013 Bacteria
## 13    OTU_65 31.302579       5.175624 1.1036791 0.000201520071 Bacteria
## 14   OTU_137 19.204046       5.063028 1.2087422 0.001238329669 Bacteria
## 15    OTU_31 66.503244       4.922465 1.0641501 0.000260125493 Bacteria
## 16   OTU_470 11.119409       4.863456 1.1254924 0.000761036754 Bacteria
## 17   OTU_718  7.332455       4.341013 1.2262635 0.007256209081 Bacteria
## 18   OTU_234 13.051993       4.116054 1.0270568 0.001980621992 Bacteria
## 19   OTU_103 27.215777       3.442949 0.8362521 0.001447531177 Bacteria
## 20   OTU_150 26.692174       3.370588 0.9055738 0.004289206568 Bacteria
## 21   OTU_184 22.686861       3.233015 0.8537763 0.003545695197 Bacteria
## 22 OTU_12904 40.356340      -2.812419 0.6212056 0.000362350545 Bacteria
## 23    OTU_60 53.621198      -3.936770 0.8312654 0.000169870097 Bacteria
## 24  OTU_1852 18.191697      -4.035752 0.9253820 0.000658774755 Bacteria
## 25   OTU_131  9.316887      -5.699746 1.6021697 0.007081067985 Bacteria
## 26   OTU_122 69.694034      -6.655889 1.2687371 0.000029388000 Bacteria
## 27   OTU_422  9.675450      -7.165091 2.0231797 0.007256209081 Bacteria
## 28 OTU_11145  9.795910       7.828920 1.8140163 0.001239377089 Bacteria
## 29 OTU_20620 11.111793       7.785726 1.8254460 0.001373951090 Bacteria
## 30   OTU_470 11.119409       7.660599 1.7309876 0.001049127933 Bacteria
## 31   OTU_718  7.332455       7.199847 1.8215282 0.003475227292 Bacteria
## 32    OTU_65 31.302579       5.531390 1.2678978 0.001155498253 Bacteria
## 33   OTU_204 16.931964       5.095962 1.1913013 0.001373951090 Bacteria
## 34   OTU_137 19.204046       4.847971 1.3582251 0.009508165934 Bacteria
## 35   OTU_234 13.051993       4.496283 1.2567287 0.009421103143 Bacteria
## 36    OTU_31 66.503244       4.405127 1.1804029 0.006732479604 Bacteria
## 37   OTU_103 27.215777       3.659913 0.9865478 0.007131196135 Bacteria
## 38 OTU_12904 40.356340      -3.191471 0.6890374 0.000470887527 Bacteria
## 39    OTU_60 53.621198      -4.073121 0.9234985 0.001049127933 Bacteria
## 40  OTU_4697 13.560473      -4.533670 1.2539279 0.008685776443 Bacteria
## 41  OTU_1852 18.191697      -5.074818 1.0110419 0.000151486861 Bacteria
## 42   OTU_122 69.694034      -7.402127 1.4025404 0.000076479777 Bacteria
## 43   OTU_422  9.675450      -8.717819 2.1765675 0.003016858021 Bacteria
##             phylum                      genus    species Comparison
## 1    Euryarchaeota         Methanobrevibacter       <NA>    RefvsR1
## 2   Actinobacteria            Virgisporangium       <NA>    RefvsR1
## 3   Actinobacteria             Pseudonocardia       <NA>    RefvsR1
## 4   Actinobacteria             Friedmanniella       <NA>    RefvsR1
## 5   Proteobacteria                Amaricoccus       <NA>    RefvsR1
## 6   Planctomycetes               Planctomyces       <NA>    RefvsR1
## 7   Actinobacteria                Rubrobacter       <NA>    RefvsR1
## 8   Actinobacteria              Mycobacterium       <NA>    RefvsR1
## 9   Actinobacteria               Nocardioides       <NA>    RefvsR1
## 10   Crenarchaeota  Candidatus Nitrososphaera    SCA1145    RefvsR1
## 11  Proteobacteria                Balneimonas       <NA>    RefvsR1
## 12  Planctomycetes                    Gemmata       <NA>    RefvsR1
## 13  Proteobacteria           Rubellimicrobium       <NA>    RefvsR1
## 14  Actinobacteria               Microlunatus       <NA>    RefvsR1
## 15  Proteobacteria                Skermanella       <NA>    RefvsR1
## 16  Actinobacteria              Modestobacter       <NA>    RefvsR1
## 17  Proteobacteria                Methylibium       <NA>    RefvsR1
## 18  Proteobacteria              Methylotenera    mobilis    RefvsR1
## 19  Proteobacteria                Rhodoplanes       <NA>    RefvsR1
## 20     Nitrospirae                 Nitrospira       <NA>    RefvsR1
## 21  Actinobacteria                      Iamia       <NA>    RefvsR1
## 22  Proteobacteria               Sphingomonas  wittichii    RefvsR1
## 23  Proteobacteria                Rhodobacter       <NA>    RefvsR1
## 24  Proteobacteria             Hyphomicrobium       <NA>    RefvsR1
## 25  Proteobacteria                 Hyphomonas       <NA>    RefvsR1
## 26      Firmicutes            Exiguobacterium       <NA>    RefvsR1
## 27      Firmicutes             Planomicrobium       <NA>    RefvsR1
## 28  Proteobacteria                Amaricoccus       <NA>    RefvsR2
## 29  Actinobacteria             Pseudonocardia       <NA>    RefvsR2
## 30  Actinobacteria              Modestobacter       <NA>    RefvsR2
## 31  Proteobacteria                Methylibium       <NA>    RefvsR2
## 32  Proteobacteria           Rubellimicrobium       <NA>    RefvsR2
## 33  Actinobacteria               Nocardioides       <NA>    RefvsR2
## 34  Actinobacteria               Microlunatus       <NA>    RefvsR2
## 35  Proteobacteria              Methylotenera    mobilis    RefvsR2
## 36  Proteobacteria                Skermanella       <NA>    RefvsR2
## 37  Proteobacteria                Rhodoplanes       <NA>    RefvsR2
## 38  Proteobacteria               Sphingomonas  wittichii    RefvsR2
## 39  Proteobacteria                Rhodobacter       <NA>    RefvsR2
## 40     Nitrospirae                 Nitrospira       <NA>    RefvsR2
## 41  Proteobacteria             Hyphomicrobium       <NA>    RefvsR2
## 42      Firmicutes            Exiguobacterium       <NA>    RefvsR2
## 43      Firmicutes             Planomicrobium       <NA>    RefvsR2

Microbial function prediction

The database FAPROTAX was used to predict the biochemical cycles of environmental samples and analyze the metabolic function of bac terial communities. FAPROTAX database was created based on the information from the Bergey’s Manual of Systematic Bacteriology and related literatures, so the result of prokaryotic trait identification is reliable and conservative

FAPROTAX automatically match the taxonomic assignments of OTUs against the taxonomic information of the FAPROTAX database (Louca et al., 2016).

# Define a function to process data
process_data <- function(dataset, matrix_name) {
  # Conversion of phyloseq to microtable object
  meco_dataset <- phyloseq2meco(dataset)
  
  # Create object of trans_func
  t2 <- trans_func$new(meco_dataset)
  t2$for_what <- "prok"
  
  # Mapping the taxonomy to the database
  t2$cal_spe_func(prok_database = "FAPROTAX")
  
  # Calculate the percentages for communities
  t2$cal_spe_func_perc(abundance_weighted = TRUE)
  
  # Return the result
  return(data.frame(matrix = matrix_name, res_spe_func_perc = t2$res_spe_func_perc))
}

# Process data for each type
Function_water <- process_data(OBJwater, "Water")
Function_biofilm <- process_data(OBJbiofilm, "Biofilm")
Function_sedim <- process_data(OBJsed, "Sediment")

remove_prefix <- function(x) {
  str_replace(x, "^[^.]+\\.", "")
}

# Rename columns using the remove_prefix function
Function_water <- Function_water %>%
  rename_all(remove_prefix)
Function_biofilm <- Function_biofilm %>%
  rename_all(remove_prefix)
Function_sedim <- Function_sedim %>%
  rename_all(remove_prefix)
# Create a function for renaming and restructuring data
rename_and_restructure <- function(data) {
  col_names <- colnames(data)
  col_names <- str_replace_all(col_names, "_", " ")
  col_names <- str_replace(col_names, "^\\w{1}", toupper)
  
  data <- data %>%
    setNames(col_names) %>%
    rownames_to_column(var = "sample_id") %>%
    select(sample_id, everything())
  
  return(data)
}

# Apply the function to each dataset
Function_water <- rename_and_restructure(Function_water)
Function_sedim <- rename_and_restructure(Function_sedim)
Function_biofilm <- rename_and_restructure(Function_biofilm)
pivot_and_scale <- function(data, cols_to_pivot) {
  data <- pivot_longer(data, cols = all_of(cols_to_pivot), names_to = "pathways", values_to = "values") %>%
    filter(values > 0) %>%
    mutate(values = values / 100)
  return(data)
}

col.order_water <- colnames(Function_water)[3:71]
col.order_biofilm <- colnames(Function_biofilm)[3:55]
col.order_sedim <- colnames(Function_sedim)[3:70]


Function2_water <- pivot_and_scale(Function_water, col.order_water)
Function2_biofilm <- pivot_and_scale(Function_biofilm, col.order_biofilm)
Function2_sedim <- pivot_and_scale(Function_sedim, col.order_sedim)
label_sites <- function(data) {
  data <- data %>%
    mutate(site = case_when(
      startsWith(sample_id, "RS") ~ "RS",
      startsWith(sample_id, "R") ~ "R",
      startsWith(sample_id, "T") ~ "T"
    ))
  return(data)
}

Function3_water <- label_sites(Function2_water)
Function3_sedim <- label_sites(Function2_sedim)
Function3_biofilm <- label_sites(Function2_biofilm)

Pathways belonging to parasites and human-related were excluded.

calculate_means <- function(data) {
  data <- data %>%
    select(-sample_id) %>%
    group_by(Matrix, site, pathways) %>%
    summarise(
      mean = exp(mean(log(values))),
      sd = sd(values)
    )
  return(data)
}

Function4_water <- calculate_means(Function3_water)
Function4_sedim <- calculate_means(Function3_sedim)
Function4_biofilm <- calculate_means(Function3_biofilm)

# Merge the cleaned data frames
Function_all <- bind_rows(Function4_water, Function4_sedim, Function4_biofilm) %>%
  filter(!pathways %in% c("Animal parasites or symbionts", "Human gut", "Intracellular parasites", "Predatory or exoparasitic", "Human pathogens all", "Mammal gut", "Invertebrate parasites", "Human associated", "Human pathogens nosocomia", "Human pathogens pneumonia", "Human pathogens septicemia"))

Ecological nitrogen functioning

path.nitrogen<- c("Aerobic ammonia oxidation", "Aerobic nitrite oxidation", "Anammox", "Denitrification", "Nitrate ammonification", "Nitrate denitrification", "Nitrate reduction", "Nitrate respiration", "Nitrification", "Nitrite ammonification", "Nitrite denitrification", "Nitrite respiration", "Nitrogen fixation", "Nitrogen respiration", "Nitrous oxide denitrification")

# Plot the data
Function_all %>%
  mutate(Matrix = factor(Matrix, levels = c("Water", "Biofilm", "Sediment")),
         site = factor(site, levels = rev(site.order))) %>%
  filter(pathways %in% path.nitrogen) %>%
  ggplot(aes(x = pathways, y = mean, fill = site)) +
  geom_bar(stat = "identity", position = "dodge") +
  geom_errorbar(aes(ymin = mean, ymax = mean + sd), position = "dodge") +
  labs(y = "Relative abundance", x = "Bacterial ecological functions") +
  facet_wrap(vars(Matrix)) +
  theme_bw() +
  coord_flip() +
  scale_fill_manual(values = custom_colors)+  # Assign custom colors to site levels
  theme(axis.title.x=element_blank(),
        axis.text.x=element_text(angle=40,hjust=0.98,size=rel(1)),
        axis.text.y=element_text(size=rel(1)))

  • Nitrogen related pathways made up to 15% of the predictions
  • We observed again functional drops from reference levels.
    • Nitrogen nitrite, nitrate respiration in water
    • Nitrogen fixation in all compartments
  • Increases were also determined from reference conditions.
    • Nitrogen, nitrite, nitrate respiration in sediment
    • Overall increase nitrification in all compartments

Ecological methane functioning

path.methane<- c("Acetoclastic methanogenesis", "Hydrogenotrophic methanogenesis", "Methanogenesis", "Methanogenesis by CO2 reduction with H2", "Methanotrophy")

# Plot the data
Function_all %>%
  mutate(Matrix = factor(Matrix, levels = c("Water", "Biofilm", "Sediment")),
         site = factor(site, levels = rev(site.order))) %>%
  filter(pathways %in% path.methane) %>%
  ggplot(aes(x = pathways, y = mean, fill = site)) +
  geom_bar(stat = "identity", position = "dodge") +
  geom_errorbar(aes(ymin = mean, ymax = mean + sd), position = "dodge") +
  labs(y = "Relative abundance", x = "Bacterial ecological functions") +
  facet_wrap(vars(Matrix)) +
  theme_bw() +
  coord_flip() +
  scale_fill_manual(values = custom_colors)+  # Assign custom colors to site levels
  theme(axis.title.x=element_blank(),
        axis.text.x=element_text(angle=40,hjust=0.98,size=rel(1)),
        axis.text.y=element_text(size=rel(1)))